The data

We take an exploratory approach to understand our omics-data better. In the “Data decription” section the creation of the data is explained while the “Data analysis” sections performs several statistical tests, which include Shapiro-Wilks test for normality, T-test and Mann-Whitney U test for significance, and exploratory plots, which include box plots, volcano plots, SVD plots, and MDS plots. Lastly we explore the data with K-means clustering.

library(tidyverse)
library(rstatix)
library(factoextra)
library(ChAMP)
library(RColorBrewer)
library(limma)
library(ComplexHeatmap)
library(ggrepel)
pal <- brewer.pal(8,"Dark2")
metabolites <- read_csv("results/metabolite_preprocessed.csv")
proteins <- read_csv("results/protein_preprocessed.csv") %>%
             dplyr::rename("patient_id" = "Patient_ID")
miRNA <- read_csv("results/miRNA_preprocessed.csv") 
metadata <- read_csv("results/metadata_preprocessed.csv")
metadata_proteins <- read_csv("data/metadata_OlinkID.csv") %>%
                     pivot_longer(cols = c(-1), names_to = "proteins") %>%
                     pivot_wider(names_from = c(1))
metadata_metabolites <- read_csv("data/metadata_metabolomics.csv")
apply_shapiro <- function(data, variables, significance) {
# 1. Running the Shapiro Wilk test
results <- data %>%
  shapiro_test(variables) 
# 2. Filtering for the significant p-values 
results %>% 
  filter(p < significance) %>% 
  nrow()
}
apply_ttest <- function(data, grp1, grp2) {
# 1. Calculating p-values
data %>% 
  select(-1) %>% 
  apply(2, function(x) {
    samp1 <- x[grp1]
    samp2 <- x[grp2]
    test_result <- t.test(samp1, samp2)
    return(test_result$p.value)
  })
}
apply_mwu <- function(data, grp1, grp2) {
# 1. Calculating p-values
mwu_p <- data %>% 
  select(-1) %>% 
  apply(2, function(x) {
    samp1 <- x[grp1]
    samp2 <- x[grp2]
    test_result <- wilcox.test(samp1, samp2)
    return(test_result$p.value)})

# 2. Calculating fold change
fold_change <- data %>% 
  select(-1) %>% 
  apply(2, function(x){
    samp1 <- x[grp1]
    samp2 <- x[grp2]
    # test_result <- mean(log2(x[seq_along(grp1)])) - mean(log2(x[(length(grp1) + 1):ncol(data)]))
    test_result <- mean(samp1) - mean(samp2)
    return(test_result)})

# 3. Combining to table
variables <- data %>% 
  select(-1) %>% 
  names()

mwu_table <- tibble(
  variable = variables,
  p.value = mwu_p,
  adjusted_p.value = p.adjust(mwu_p, method = "BH"),
  fold_change = fold_change)}
plot_volcano <- function(data, significance) {
  data %>% 
    ggplot(
      aes(
        x = fold_change,
        y = -log10(p.value),
        color = p.value < significance
      )) +
      geom_point() +
    labs(
      x = "Fold change (log2)",
      y = "P-value (-log10)",
      color = str_glue("p < {significance}")
    )
}
plot_densities <- function(data, title) {
data %>% 
  select(2:ncol(data)) %>%
  pivot_longer(
    everything(),
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot(mapping = aes(
    x = value,
    y = after_stat(scaled),
    color = variable
  )) +
    geom_density() +
    guides(color = "none") +
    labs(
      x = "Value",
      y = "Density",
      title = title
  )}

miRNA

Data description

The miRNA data is analysed for 36 samples, 33 patients and 3 healthy donors.

Method: NGS

Preprocessing: The readings are mapped to known human miRNA from the miRBase data base. !(The data output comes out in the form of counts - no modification). The facility also provides mature miRNA log transformed values (that we use here). Constant values of this dataset were removed.

Data analysis

Initial explorataion

Checking sample’s outliers

# Plotting the samples/individuals
# pdf(file = "results/plots/miRNA_box_plot.pdf", wi = 9, he = 6)
miRNA %>% 
  select(-patient_id) %>% 
  t() %>% 
  boxplot(
    col = palette()[-1],
    main = "miRNA distribtution", xlab = "Samples", ylab = "miRNA concentration ")

Shapiro-Wilks test

miRNA expression data often does not follow a normal distribution, even after log transformation. The number of miRNA that do not satisfy normality at p = 0.05 is: 1420

# Applying Shapiro Wilks test for normality
variables <- miRNA %>% 
  select(-patient_id) %>% 
  names()
apply_shapiro(miRNA, variables, 0.05)
## [1] 2873

T-test

The number of miRNA that were significantly different between PSC samples and controls at p = 0.05 is: 506

# Calculating the p-values
p_values <- miRNA %>% 
  apply_ttest(grp1 = c(1:33), grp2 = c(34:36))
hist(p_values)

# Calculating the amount of significant proteins
sum(p_values < 0.05)
## [1] 1107

Mann-Whitney U test and Volcano plot

# pdf(file = "results/plots/miRNA_p_histogram.pdf", wi = 9, he = 6)
mwu <- apply_mwu(miRNA, grp1 = c(1:33), grp2 = c(34:36))
hist(mwu)   

datatable(mwu %>%
            mutate(fold_change = abs(fold_change)),
          filter =  "top",
          extensions = 'Buttons', 
    options = list(dom = 'Bfrtip', 
                   buttons = c('excel', "csv"))) %>%
   formatRound(columns = 2:4, digits=6)

The number of significantly differentially expressed miRNA at p < 0.05 is 341, of which 312 are upregulated and 29 are downregulated.

# Filter for significant miRNAs (customize the conditions if needed)
# Add a new column for miRNAs to label based on the thresholds
mwu$label <- ifelse((mwu$p.value < 0.05 & (mwu$fold_change < -2.5 | mwu$fold_change > 8.5) | mwu$p.value < 0.00029 & mwu$fold_change > 6.5), mwu$variable, NA)

# pdf(file = "results/plots/volcano_miRNA.pdf")
ggplot(mwu, aes(x = fold_change, y = -log10(p.value), color = ifelse(p.value < 0.05, "Significant", "Non-significant"))) +
  geom_point(alpha = 0.8, size = 1.5) +
  geom_label_repel(aes(label = label), 
                  size = 3, 
                  box.padding = 0.3, 
                  max.overlaps = 10,
                  color = "black",
                  show.legend = FALSE) +
  scale_color_manual(values = c("Non-significant" = "royalblue", "Significant" = "firebrick")) +
  theme_minimal(base_size = 14) +
  theme(panel.background = element_rect(fill = "lightgray", color = NA)) +
  labs(x = "Fold Change", y = "-Log10(p-value)", color = "p < 0.05") +
  ggtitle("Volcano Plot of Mann-Whitney test of miRNAs")

SVD

We check the plausible confounders and the studied effects in the data

beta1 <- miRNA %>% 
  dplyr::slice(1:33) %>%
  dplyr::select(-patient_id) %>%
  t() %>% 
  as.data.frame()

pd1 <- metadata %>% 
  dplyr::slice(1:33) %>%
  select(cca_binary, ibd_binary, crohn_or_uc, fibrosis, fibrosis_binary, alp, alp_binary, bilirubin, bilirubin_binary, group, sex, age, age_cat, bmi, bmi_cat, crp, alat, IgG, IgA, auto_anb, overlap_aih, steroids_other_medic, jaundice, pruritis, fever_chill, fatigue, no_symptoms) %>%
  as.data.frame()
  
pdf(file = "results/plots/svd_miRNA.pdf")
champ.SVD(beta = beta1, pd = pd1)
 # while (!is.null(dev.list()))  dev.off()

MDS plot of groups

Using MDS plots to check the grouping of the data

# pdf(file = "results/plots/MDS_miRNA.pdf", wi = 9, he = 6)
miRNA %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 2),
    col=pal[factor(metadata$group[1:33])])

miRNA %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 3),
    col=pal[factor(metadata$group[1:33])])

miRNA %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(2, 3),
    col=pal[factor(metadata$group[1:33])])

Proteins

Data description

The protein data is analysed for 36 samples, 33 patients and 3 healthy donors.

Method: Olink’s proximity extension assay (PEA) is used in which each protein are bound by two antibodies with tail DNA oligonucleotides. Upon binding the oligonucleotides hybridize and are extended by DNA polymerase forming a barcode that can be read by qPCR or NGS. The PEA was done with Olink Target 96, 7 panels of proteins with 92 proteins each, total of 644 proteins. The panels are “CELL REGULATION”, “DEVELOPMENT”, “IMMUNE RESPONSE”, “INFLAMMATION”, “METABOLISM”, “ONCOLOGY II”, “ONCOLOGY III”, “ORGAN DAMAGE”.

Preprocessing: Only one sample did not pass the quality control, sample ID 30 for “ONCOLOGY III”. Several proteins did not pass quality control and were removed (failed in 8% or more samples, see proteins_preprocessing.R). The data comes in the unit of normalized protein expression (NPX) values on a log2 scale. They are normalized to interplate controls.

Links Olink Target 96: https://olink.com/products/olink-target-96 Olink preprocessing for qPCR: https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf Panel proteins: https://www.bioxpedia.com/olink-proteomics/#:~:text=The%20readout%20for%20most%20Olink,equals%20a%20high%20protein%20concentration.

Data analysis

Initial exploration

Preliminary Checks:

Visualize your log-transformed data (e.g., histograms, density plots) to ensure the transformation effectively reduces skewness. If outliers are present, consider their biological relevance. The Mann-Whitney test is robust to outliers but may still be influenced by their ranks.

# Plotting the samples/individuals
proteins %>%
  select(-patient_id) %>%
  t() %>%
  limma::plotDensities()

# 
# Plotting the features/variables
# proteins %>%
#   select(-patient_id) %>%
#   limma::plotDensities()

Checking sample and data for outliers

# Plotting the samples/individuals
# pdf(file = "results/plots/proteins_barplot.pdf", wi = 9, he = 6)
proteins %>% 
  select(-patient_id) %>% 
  t() %>% 
  boxplot(col = palette()[-1],
    main = "Protein distribtution", xlab = "Samples", ylab = "Protein concentration")

# Plotting the features/variables
# proteins %>% 
#   select(-patient_id) %>% 
#   boxplot(col = palette()[-1],
#     main = "Protein distribtution", ylab = "Protein concentration ")

MSD

# Plotting the features/variables
proteins %>%
  select(-patient_id) %>%
  limma::plotMDS()

Shapiro-Wilks test

The number of proteins that do not satisfy normality at p = 0.05 is:

# Applying Shapirot_Wilks test for normality
variables <- proteins %>% 
  select(-patient_id) %>% 
  names()
apply_shapiro(proteins, variables, 0.05)
## [1] 232

T-test

The number of proteins that were significantly different between PSC samples and controls at p = 0.05 is: 257

p_values <- proteins %>% 
  apply_ttest(grp1 = c(1:33), grp2 = c(34:36))
# pdf(file = "results/plots/proteins_histogram.pdf", wi = 9, he = 6)
hist(p_values)

# Calculating the amount of significant proteins
sum(p_values < 0.05)
## [1] 45
# pdf(file = "results/plots/proteins_p_histogram.pdf", wi = 9, he = 6)
mwu <- apply_mwu(proteins, grp1 = c(1:33), grp2 = c(34:36))
hist(mwu)   

datatable(mwu %>%
            inner_join(metadata_proteins %>%
                         dplyr::select(proteins, Assay),
                       by = c("variable" = "proteins")) %>%
                         dplyr::select(-variable) %>%
                         relocate(Assay),
          filter =  "top",
          extensions = 'Buttons',
    options = list(dom = 'Bfrtip',
                   buttons = c('excel', "csv"))) %>%
   formatRound(columns = 2:4, digits=6)

The number of significantly differentially expressed proteins at p < 0.05 is 22, of which 19 are upregulated and 3 are downregulated.

Mann-Whitney U test and Volcano plot

# pdf(file = "results/plots/proteins_volcano.pdf", wi = 9, he = 6)
# Add a new column for proteins to label based on the thresholds
mwu <- mwu %>%
            inner_join(metadata_proteins %>%
                         dplyr::select(proteins, Assay),
                       by = c("variable" = "proteins"))
mwu$label <- ifelse((mwu$p.value < 0.05 & (mwu$fold_change < -1 | mwu$fold_change > 1) | mwu$p.value < 0.01 ), mwu$Assay, NA)
ggplot(mwu, aes(x = fold_change, y = -log10(p.value), color = ifelse(p.value < 0.05, "Significant", "Non-significant"))) +
  geom_point(alpha = 0.8, size = 1.5) +
  geom_label_repel(aes(label = label), 
                  size = 3, 
                  box.padding = 0.3, 
                  max.overlaps = 10,
                  color = "black",
                  show.legend = FALSE) +
  scale_color_manual(values = c("Non-significant" = "royalblue", "Significant" = "firebrick")) +
  theme_minimal(base_size = 14) +
  theme(panel.background = element_rect(fill = "lightgray", color = NA)) +
  labs(x = "Fold Change", y = "-Log10(p-value)", color = "p < 0.05") +
  ggtitle("Volcano Plot of Mann-Whitney test of proteins")

SVD

We check the plausible confounders and the studied effects in the data

beta1 <- proteins %>% 
  dplyr::slice(1:33) %>%       ### no Control
  dplyr::select(-patient_id) %>%
  t() %>% 
  as.data.frame() 

pd1 <- metadata %>% 
  dplyr::slice(1:33) %>%
  select(cca_binary, ibd_binary, crohn_or_uc, fibrosis, fibrosis_binary, alp, alp_binary, bilirubin, bilirubin_binary, group, sex, age, age_cat, bmi, bmi_cat, crp, alat, IgG, IgA, auto_anb, overlap_aih, steroids_other_medic, jaundice, pruritis, fever_chill, fatigue, no_symptoms) %>%
  as.data.frame()

# pdf(file = "results/plots/svd_proteins.pdf")
champ.SVD(beta = beta1, pd = pd1)

# while (!is.null(dev.list()))  dev.off()

MSD multiple dimensions

Effect of age category

Using MDS plots to check the grouping of the data according to the confouders found by SVD: colors represent age categories: No much effect of age categories

# pdf(file = "results/plots/MDS_by_age_proteins.pdf")
proteins %>%
  dplyr::select(-patient_id) %>%
  # dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 2),
    col=pal[factor(metadata$alp_binary[1:33])])

proteins %>%
  dplyr::select(-patient_id) %>%
  # dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 3),
    col=pal[factor(metadata$alp_binary[1:33])])

proteins %>%
  dplyr::select(-patient_id) %>%
  # dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(2, 3),
    col=pal[factor(metadata$alp_binary[1:33])])

Effect of groups

Looking at the groups (early PSC, CCA, IBD, Advanced and progressing): there is some grouping but not good enough: some feature selection may be needed.

# pdf(file = "results/plots/MDS_proteins.pdf")
proteins %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 2),
    col=pal[factor(metadata$group[1:33])])

proteins %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 3),
    col=pal[factor(metadata$group[1:33])])

proteins %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(2, 3),
    col=pal[factor(metadata$group[1:33])])

Metabolites

Data description

The metabolite data is analysed for 45 samples, 33 patients and 12 healthy donors.

Method: Ultra performance liquid chromatography + Tandem mass spectroscopy.

Preprocessing: The values are log transformed (base unknown). The missing values are imputated with the minimum observed value of that respective compound. The data is normalized so that the median/medians are 1.0000 and all other values are proportional.

Data analysis

log2_metabolites <- cbind(metabolites[1], 
  log2(metabolites[2:length(metabolites)]))

Initil plots

# Plotting the samples/individuals
log2_metabolites %>%
  select(-patient_id) %>%
  t() %>%
  limma::plotDensities()

# 
# # Plotting the features/variables
# proteins %>% 
#   select(-patient_id) %>% 
#   limma::plotDensities()

Checking sample’s outliers

# pdf(file = "results/plots/barplots_metabolites.pdf")
log2_metabolites %>% 
  select(-patient_id) %>% 
  t() %>% 
  boxplot(
    col = palette()[-1],
    main = "Metabolite distribtution", xlab = "Samples", ylab = "Metabolite concentration ")

# Plotting the features/variables
# log2_metabolites %>% 
#   select(-patient_id) %>% 
#   boxplot(
#     col = palette()[-1],
#     main = "Metabolite distribtution", ylab = "Metabolite concentration ")

Shapiro-Wilks test

The number of metabolites that do not satisfy normality at p = 0.05 is: 524

# Applying Shapirot_Wilks test for normality
variables <- log2_metabolites %>% 
  select(-patient_id) %>% 
  names()
apply_shapiro(log2_metabolites, variables, 0.05)
## [1] 524

T-test

The number of metabolites that were significantly different between PSC samples and controls at p = 0.05 is: 391

# Calculating the p-values
p_values <- metabolites %>% 
  apply_ttest(grp1 = c(1:33), grp2 = c(34:45))

# Plotting the distribution
hist(p_values)

# Calculating the amount of significant proteins
sum(p_values < 0.05)
## [1] 391

Mann-Whitney U test and Volcano plot

# pdf(file = "results/plots/p_val_histogram_metabolites.pdf")
mwu <- apply_mwu(log2_metabolites, grp1 = c(1:33), grp2 = c(34:45))
hist(mwu)

mwu <- mwu %>%
            full_join(metadata_metabolites %>% 
                      select(BIOCHEMICAL, 'SUPER PATHWAY', 'COMP ID') %>%
                        mutate_at("COMP ID", as.character),
                      by = c("variable" = "COMP ID"))

datatable(mwu,
          filter =  "top",
          extensions = 'Buttons', 
    options = list(dom = 'Bfrtip', 
                   buttons = c('excel', "csv")))

The number of significantly differentially expressed miRNA at p < 0.01 is 276, of which 95 are upregulated and 223 are downregulated.

  mwu$label <- ifelse((mwu$p.value < 0.01 & (mwu$fold_change < -3 | mwu$fold_change > 5.2) | mwu$p.value < 0.0000000001 ), mwu$`SUPER PATHWAY`, NA)

# pdf(file = "results/plots/volcano_metabolites.pdf")
ggplot(mwu, aes(x = fold_change, y = -log10(p.value), color = ifelse(p.value < 0.05, "Significant", "Non-significant"))) +
  geom_point(alpha = 0.8, size = 1.5) +
  geom_label_repel(aes(label = label), 
                  size = 3, 
                  box.padding = 0.3, 
                  max.overlaps = 10,
                  color = "black",
                  show.legend = FALSE) +
  scale_color_manual(values = c("Non-significant" = "royalblue", "Significant" = "firebrick")) +
  theme_minimal(base_size = 14) +
  theme(panel.background = element_rect(fill = "lightgray", color = NA)) +
  labs(x = "Fold Change", y = "-Log10(p-value)", color = "p < 0.01") +
  ggtitle("Volcano Plot of Mann-Whitney test of metabolites")

SVD

We check the plausible confounders and the studied effects in the data

beta1 <- log2_metabolites %>% 
  dplyr::slice(1:36) %>% 
  dplyr::select(-patient_id) %>%
  t() %>% 
  as.data.frame()

pd1 <- metadata %>% 
  # dplyr::slice(1:33) %>%
  select(cca_binary, ibd_binary, crohn_or_uc, fibrosis, fibrosis_binary, alp, alp_binary, bilirubin, bilirubin_binary, group, sex, age, age_cat, bmi, bmi_cat, crp, alat, IgG, IgA, auto_anb, overlap_aih, steroids_other_medic, jaundice, pruritis, fever_chill, fatigue, no_symptoms) %>%
  as.data.frame()

pdf(file = "results/plots/svd_metabolomics.pdf")
champ.SVD(beta = beta1, pd = pd1)
 # while (!is.null(dev.list()))  dev.off()

MDS multiple dimensions

Using MDS plots to check the grouping of the data according to the confouders found by SVD. Age effect is clearly visible in the data. It may be necessary to adjust for it or to do some feature selection.

Age category effect

# pdf(file = "results/plots/metabolites_MDS_by_age.pdf")
log2_metabolites %>%
  dplyr::select(-patient_id) %>%
  # dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 2),
    col=pal[factor(metadata$bilirubin_binary[1:33])])

log2_metabolites %>%
  dplyr::select(-patient_id) %>%
  # dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 3),
    col=pal[factor(metadata$bilirubin_binary[1:33])])

log2_metabolites %>%
  dplyr::select(-patient_id) %>%
  # dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(2, 3),
    col=pal[factor(metadata$bilirubin_binary[1:33])])

Looking at the groups (early PSC, CCA, IBD, Advanced and progressing): there is some grouping but not good enough: some feature selection may be needed.

Effect of groups

# pdf(file = "results/plots/metabolites_MDS.pdf")
log2_metabolites %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 2),
    col=pal[factor(metadata$group[1:33])])

log2_metabolites %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(1, 3),
    col=pal[factor(metadata$group[1:33])])

log2_metabolites %>%
  dplyr::select(-patient_id) %>%
  dplyr::slice(1:33) %>%
  t() %>%
  plotMDS(
    top=1000,
    gene.selection="common",
    dim.plot = c(2, 3),
    col=pal[factor(metadata$group[1:33])])

Unsupervised clustering of the data

We scaled (z-score) the data before clustering.

Why Scaling Matters for K-Means: K-means calculates distances between data points using all features. If features are on different scales (e.g., miRNAs range from 0-10, proteins range from 100-10,000), features with larger magnitudes dominate the distance calculation, biasing the clustering results.

miRNA data

res_km_r <- eclust(scale(miRNA[, 2:ncol(miRNA)]), "kmeans")

fviz_gap_stat(res_km_r$gap_stat)

# pdf(file = "results/Illustrations/transcriptomics_kmean_k4.pdf")
res_km_r <- eclust(scale(miRNA %>% dplyr::select(-patient_id)), "kmeans", k = 3)

res_km_r <- eclust(scale(miRNA %>% dplyr::select(-patient_id)), FUNcluster = "hclust", k = 4, graph = TRUE)
fviz_cluster(res_km_r, scale(miRNA %>% dplyr::select(-patient_id)), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())

Proteomics data

res_km_r <- eclust(scale(proteins %>% dplyr::select(-patient_id)), "kmeans")

# pdf(file = "results/plots/proteomics_kmean_k3.pdf")
fviz_gap_stat(res_km_r$gap_stat)

res_km_r <- eclust(scale(proteins %>% dplyr::select(-patient_id)), "kmeans", k = 3) 

res_km_r <- eclust(scale(proteins %>% dplyr::select(-patient_id)), FUNcluster = "hclust", k = 3, graph = TRUE)
fviz_cluster(res_km_r, scale(proteins %>% dplyr::select(-patient_id)), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())

Metabolomics

Scaled, Best k = 4

res_km_r <- eclust(scale(log2_metabolites %>% dplyr::select(-patient_id)), "kmeans")

# pdf(file = "results/plots/metabolites_kmean_k3.pdf")
fviz_gap_stat(res_km_r$gap_stat)

res_km_r <- eclust(scale(log2_metabolites %>% dplyr::select(-patient_id)), "kmeans", k = 3) 

res_km_r <- eclust(scale(log2_metabolites), FUNcluster = "hclust", k = 3, graph = TRUE)
fviz_cluster(res_km_r, scale(log2_metabolites %>% dplyr::select(-patient_id)), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())

Non scaled

res_km_r <- eclust(log2_metabolites %>% dplyr::select(-patient_id), "kmeans")

fviz_gap_stat(res_km_r$gap_stat)

# pdf(file = "results/Paper_illustratios/kmean_lasso_grpcolor.pdf")
res_km_r <- eclust(log2_metabolites%>% dplyr::select(-patient_id), "kmeans", k = 4) 

# while (!is.null(dev.list()))  dev.off()

res_km_r <- eclust(log2_metabolites%>% dplyr::select(-patient_id), FUNcluster = "hclust", k = 4, graph = TRUE)
fviz_cluster(res_km_r, log2_metabolites%>% dplyr::select(-patient_id), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())